#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr) ; library(kableExtra)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(WGCNA)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd) and clustering (pipeline in 05_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]

load('./../Data/GSEA.RData')
GSEA_old_SFARI = enrichment_old_SFARI
GSEA_SFARI = enrichment_SFARI

load('./../Data/ORA.RData')
ORA_old_SFARI = enrichment_old_SFARI
ORA_SFARI = enrichment_SFARI

rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, enrichment_DGN, enrichment_DO, enrichment_GO,
   enrichment_KEGG, enrichment_Reactome, enrichment_SFARI)


Selecting Top Modules


We have results both from GSEA and ORA to measure the enrichment of SFARI Genes in each module, and they both agree with each other relatively well

SFARI_genes_by_module = c()

for(module in names(GSEA_old_SFARI)){
  
  GSEA_info = GSEA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust, NES) %>%
              mutate(pvalue = ifelse(NES>0, pvalue, 1-pvalue), 
                     p.adjust = ifelse(NES>0, p.adjust, 1)) %>%
              dplyr::rename('GSEA_pval' = pvalue, 'GSEA_padj'= p.adjust)
  
  ORA_info = ORA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust, qvalue, GeneRatio, Count) %>%
             dplyr::rename('ORA_pval' = pvalue, 'ORA_padj' = p.adjust)
  
  module_info = GSEA_info %>% full_join(ORA_info, by = 'ID') %>% add_column(.before = 'ID', Module = module)
  
  SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}

SFARI_genes_by_module = SFARI_genes_by_module %>% 
                        left_join(dataset %>% dplyr::select(Module, MTcor) %>% 
                                  group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module') %>%
                        mutate(ORA_pval = ifelse(is.na(ORA_pval), 1, ORA_pval),
                               ORA_padj = ifelse(is.na(ORA_padj), 1, ORA_padj))

plot_data = SFARI_genes_by_module %>% filter(ID=='SFARI')

ggplotly(plot_data %>% ggplot(aes(1-GSEA_pval, 1-ORA_pval, size = n)) + 
         geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) + 
         geom_smooth(se=FALSE, color = '#CCCCCC') + 
         xlab('GSEA Enrichment') + ylab('ORA Enrichment') + coord_fixed() +
         ggtitle(paste0('Corr = ', round(cor(plot_data$GSEA_pval, plot_data$ORA_pval),2))) +
         theme_minimal() + theme(legend.position = 'none'))



To determine which modules have a statistically significant enrichment in SFARI Genes we can use the adjusted p-values. We used the Bonferroni correction for this.

GSEA identifies 20/55 as significant. This doesn’t make sense

ggplotly(plot_data %>% ggplot(aes(GSEA_padj, ORA_padj, size = n)) + 
         geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) + 
         geom_smooth(se=FALSE, color = '#CCCCCC') + 
         geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dashed') +
         geom_vline(xintercept = 0.01, color = 'gray', linetype = 'dashed') +
         xlab('GSEA adjusted p-value') + ylab('ORA adjusted p-value') + 
         scale_x_log10(limits = c(min(plot_data$GSEA_padj, plot_data$ORA_padj),1.2)) + 
         scale_y_log10(limits = c(min(plot_data$GSEA_padj, plot_data$ORA_padj),1.2)) +
         ggtitle(paste0('Corr = ',round(cor(plot_data$GSEA_padj, plot_data$ORA_padj),2))) + coord_fixed() +
         theme_minimal() + theme(legend.position = 'none'))
plot_data = plot_data %>% mutate(GSEA_sig = GSEA_padj<0.01, ORA_sig = ORA_padj<0.01) %>%
            apply_labels(GSEA_sig = 'GSEA significant enrichment',
                         ORA_sig = 'ORA significant enrichment')

cro(plot_data$GSEA_sig, list(plot_data$ORA_sig, total()))
 ORA significant enrichment     #Total 
 FALSE   TRUE   
 GSEA significant enrichment 
   FALSE  35   35
   TRUE  17 3   20
   #Total cases  52 3   55



The ‘over-enrichment’ in SFARI Modules in GSEA could be because SFARI Genes have in general higher Module Memberships than the other genes, which would make them cluster at the beginning of the list constantly and would bias the enrichment analysis.

Looking at the plot below, we can see that there is not a uniform distribution of SFARI genes across all quantiles of the Module Membership values, but they instead seem to cluster around Module Membership values with high magnitudes (both positive and negative), so I don’t think the GSEA results for the SFARI genes are valid.

Because of this, I’m going to use the enrichment from the ORA to study the SFARI Genes

quant_data = dataset %>% dplyr::select(ID, contains('MM.')) %>% 
             left_join(genes_info %>% dplyr::select(ID, gene.score), by = 'ID') %>% dplyr::select(-ID) %>%
             melt %>% mutate(quant = cut(value, breaks = quantile(value, probs = seq(0,1,0.05)) %>% 
                                     as.vector, labels = FALSE)) %>%
             group_by(gene.score, quant) %>% tally %>% ungroup %>% ungroup
  
quant_data = quant_data %>% group_by(quant) %>% summarise(N = sum(n)) %>% ungroup %>% 
            left_join(quant_data, by = 'quant') %>% dplyr::select(quant, gene.score, n, N) %>% 
            mutate(p = round(100*n/N,2)) %>% filter(!is.na(quant)) %>%
            mutate(gene.score = factor(gene.score, levels=rev(c('1','2','3','4','5','6','Neuronal','Others'))))

ggplotly(quant_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% 
         ggplot(aes(quant, p, fill = gene.score)) + geom_bar(stat='identity') + 
         xlab('Module Membership Quantiles') + ylab('% of SFARI Genes in Quantile') +
         ggtitle('Percentage of Genes labelled as SFARI in each Quantile') +
         scale_fill_manual(values = SFARI_colour_hue(r=rev(c(1:6)))) + 
         theme_minimal() + theme(legend.position = 'none'))
rm(quant_data)


Selecting modules with an adjusted p-value below 0.01 using the ORA

ggplotly(plot_data %>% ggplot(aes(MTcor, ORA_padj, size=n)) + 
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) +
         geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dotted') + 
         xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         ggtitle('Modules Significantly Enriched in SFARI Genes') +
         theme_minimal() + theme(legend.position = 'none'))
top_modules = plot_data %>% arrange(desc(ORA_padj)) %>% filter(ORA_padj<0.01) %>% pull(Module) %>% as.character

plot_data %>% filter(Module %in% top_modules) %>% arrange(ORA_pval) %>%
              dplyr::select(Module, MTcor, ORA_pval, ORA_padj, qvalue, GeneRatio, Count) %>%
              rename( ORA_pval = 'p-value', ORA_padj = 'Adjusted p-value') %>%
              kable %>% kable_styling(full_width = F)
Module MTcor p-value Adjusted p-value qvalue GeneRatio Count
#96A900 0.0721036 5.30e-06 0.0000368 0.0000119 26/173 26
#00C0BF -0.5573039 1.83e-05 0.0001465 0.0000964 39/336 39
#44A0FF 0.6175450 5.48e-05 0.0003839 0.0001732 56/574 56

We lose Module #00BADE, which was significantly enriched with the new SFARI genes. The other three modules were also found to be significant in the new SFARI Genes, so details about these modules can be found in 08_to_SFARI_modules.html (I’m not going to repeat them here since there’s no new information)



ORA New vs Old SFARI Genes


In general, there isn’t a big change in SFARI enrichment between the two versions of the SFARI Genes

SFARI_genes_by_module = c()

for(module in names(ORA_old_SFARI)){
  
  ORA_old_info = ORA_old_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust) %>%
                 dplyr::rename('pval_old_SFARI' = pvalue, 'padj_old_SFARI' = p.adjust)
  
  ORA_info = ORA_SFARI[[module]] %>% dplyr::select(ID, pvalue, p.adjust) %>%
             dplyr::rename('pval_SFARI' = pvalue, 'padj_SFARI' = p.adjust)
  
  module_info = ORA_old_info %>% full_join(ORA_info, by = 'ID') %>% add_column(.before = 'ID', Module = module)
  
  SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}

SFARI_genes_by_module = SFARI_genes_by_module %>% 
                        left_join(dataset %>% dplyr::select(Module, MTcor) %>% 
                                  group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module') %>%
                        mutate(pval_old_SFARI = ifelse(is.na(pval_old_SFARI), 1, pval_old_SFARI),
                               pval_SFARI = ifelse(is.na(pval_SFARI), 1, pval_SFARI),
                               padj_old_SFARI = ifelse(is.na(padj_old_SFARI), 1, padj_old_SFARI),
                               padj_SFARI = ifelse(is.na(padj_SFARI), 1, padj_SFARI))

plot_data = SFARI_genes_by_module %>% filter(ID=='SFARI')

ggplotly(plot_data %>% ggplot(aes(1-pval_old_SFARI, 1-pval_SFARI, size = n)) + 
         geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) + 
         geom_abline(slope = 1, intercept = 0, color = '#CCCCCC', linetype = 'dotted') + 
         xlab('Enrichment Old SFARI Genes') + ylab('Enrichment New SFARI Genes') + coord_fixed() +
         ggtitle(paste0('Corr = ', round(cor(plot_data$pval_old_SFARI, plot_data$pval_SFARI),2))) +
         theme_minimal() + theme(legend.position = 'none'))


ggplotly(plot_data %>% ggplot(aes(padj_old_SFARI, padj_SFARI, size = n)) + 
         geom_point(color = plot_data$Module, alpha = .7, aes(id=Module)) + 
         geom_hline(yintercept = 0.01, color = 'gray', linetype = 'dashed') +
         geom_vline(xintercept = 0.01, color = 'gray', linetype = 'dashed') +
         geom_abline(slope = 1, intercept = 0, color = '#CCCCCC', linetype = 'dotted') + 
         xlab('Adjusted p-value Old SFARI Genes') + ylab('Adjusted p-value New SFARI Genes') + coord_fixed() +
         scale_x_log10(limits = c(min(plot_data$padj_old_SFARI, plot_data$padj_SFARI),1.2)) + 
         scale_y_log10(limits = c(min(plot_data$padj_old_SFARI, plot_data$padj_SFARI),1.2)) + 
         ggtitle(paste0('Corr = ', round(cor(plot_data$padj_old_SFARI, plot_data$padj_SFARI),2))) +
         theme_minimal() + theme(legend.position = 'none'))


Conclusion


Enrichment doesn’t change much between SFARI Gene versions, sharing 3/4 statistically enriched modules and losing one just by a few points in its adjusted p-value, so the conclusions that we drew from the new SFARI Genes can be extended to the old SFARI Genes as well



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] WGCNA_1.69             fastcluster_1.1.25     dynamicTreeCut_1.63-1 
##  [4] org.Hs.eg.db_3.8.2     AnnotationDbi_1.46.1   IRanges_2.18.3        
##  [7] S4Vectors_0.22.1       Biobase_2.44.0         BiocGenerics_0.30.0   
## [10] DOSE_3.10.2            ReactomePA_1.28.0      clusterProfiler_3.12.0
## [13] biomaRt_2.40.5         kableExtra_1.1.0       knitr_1.28            
## [16] doParallel_1.0.15      iterators_1.0.12       foreach_1.5.0         
## [19] polycor_0.7-10         expss_0.10.2           ggpubr_0.2.5          
## [22] magrittr_1.5           GGally_1.5.0           gridExtra_2.3         
## [25] viridis_0.5.1          viridisLite_0.3.0      RColorBrewer_1.1-2    
## [28] dendextend_1.13.4      plotly_4.9.2           glue_1.4.1            
## [31] reshape2_1.4.4         forcats_0.5.0          stringr_1.4.0         
## [34] dplyr_1.0.0            purrr_0.3.4            readr_1.3.1           
## [37] tidyr_1.1.0            tibble_3.0.1           ggplot2_3.3.2         
## [40] tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0            RSQLite_2.2.0              
##   [3] htmlwidgets_1.5.1           grid_3.6.3                 
##   [5] BiocParallel_1.18.1         munsell_0.5.0              
##   [7] codetools_0.2-16            preprocessCore_1.46.0      
##   [9] withr_2.2.0                 colorspace_1.4-1           
##  [11] GOSemSim_2.10.0             highr_0.8                  
##  [13] rstudioapi_0.11             ggsignif_0.6.0             
##  [15] labeling_0.3                urltools_1.7.3             
##  [17] GenomeInfoDbData_1.2.1      polyclip_1.10-0            
##  [19] bit64_0.9-7                 farver_2.0.3               
##  [21] vctrs_0.3.1                 generics_0.0.2             
##  [23] xfun_0.12                   R6_2.4.1                   
##  [25] GenomeInfoDb_1.20.0         graphlayouts_0.7.0         
##  [27] locfit_1.5-9.4              DelayedArray_0.10.0        
##  [29] bitops_1.0-6                reshape_0.8.8              
##  [31] fgsea_1.10.1                gridGraphics_0.5-0         
##  [33] assertthat_0.2.1            scales_1.1.1               
##  [35] ggraph_2.0.3                nnet_7.3-14                
##  [37] enrichplot_1.4.0            gtable_0.3.0               
##  [39] tidygraph_1.2.0             rlang_0.4.6                
##  [41] genefilter_1.66.0           splines_3.6.3              
##  [43] lazyeval_0.2.2              acepack_1.4.1              
##  [45] impute_1.58.0               broom_0.5.5                
##  [47] europepmc_0.4               checkmate_2.0.0            
##  [49] BiocManager_1.30.10         yaml_2.2.1                 
##  [51] modelr_0.1.6                crosstalk_1.1.0.1          
##  [53] backports_1.1.8             qvalue_2.16.0              
##  [55] Hmisc_4.4-0                 tools_3.6.3                
##  [57] ggplotify_0.0.5             ellipsis_0.3.1             
##  [59] ggridges_0.5.2              Rcpp_1.0.4.6               
##  [61] plyr_1.8.6                  base64enc_0.1-3            
##  [63] progress_1.2.2              zlibbioc_1.30.0            
##  [65] RCurl_1.98-1.2              prettyunits_1.1.1          
##  [67] rpart_4.1-15                cowplot_1.0.0              
##  [69] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [71] ggrepel_0.8.2               cluster_2.1.0              
##  [73] fs_1.4.0                    data.table_1.12.8          
##  [75] DO.db_2.9                   triebeard_0.3.0            
##  [77] reprex_0.3.0                reactome.db_1.68.0         
##  [79] matrixStats_0.56.0          xtable_1.8-4               
##  [81] hms_0.5.3                   evaluate_0.14              
##  [83] XML_3.99-0.3                jpeg_0.1-8.1               
##  [85] readxl_1.3.1                compiler_3.6.3             
##  [87] crayon_1.3.4                htmltools_0.4.0            
##  [89] mgcv_1.8-31                 Formula_1.2-3              
##  [91] geneplotter_1.62.0          lubridate_1.7.4            
##  [93] DBI_1.1.0                   tweenr_1.0.1               
##  [95] dbplyr_1.4.2                MASS_7.3-51.6              
##  [97] rappdirs_0.3.1              Matrix_1.2-18              
##  [99] cli_2.0.2                   igraph_1.2.5               
## [101] GenomicRanges_1.36.1        pkgconfig_2.0.3            
## [103] rvcheck_0.1.8               foreign_0.8-76             
## [105] xml2_1.2.5                  annotate_1.62.0            
## [107] webshot_0.5.2               XVector_0.24.0             
## [109] rvest_0.3.5                 digest_0.6.25              
## [111] graph_1.62.0                rmarkdown_2.1              
## [113] cellranger_1.1.0            fastmatch_1.1-0            
## [115] htmlTable_1.13.3            curl_4.3                   
## [117] graphite_1.30.0             lifecycle_0.2.0            
## [119] nlme_3.1-147                jsonlite_1.7.0             
## [121] fansi_0.4.1                 pillar_1.4.4               
## [123] lattice_0.20-41             httr_1.4.1                 
## [125] survival_3.1-12             GO.db_3.8.2                
## [127] UpSetR_1.4.0                png_0.1-7                  
## [129] bit_1.1-15.2                ggforce_0.3.1              
## [131] stringi_1.4.6               blob_1.2.1                 
## [133] DESeq2_1.24.0               latticeExtra_0.6-29        
## [135] memoise_1.1.0